Setup

The settings list settings configure which parts, simulations, and analysis steps of this R markdown are executed. This way single components of the analysis can be disabled for the purpose of saving computation time.

settings <- list(
  # Generate interactive plots of co-expression structures
  investigate.coexpression=F,
  # Perform Differential Expression Analysis
  perform.DEA=T,
  # Generate paper figures
  generate.paper.figs=T,
  # Specify if paper figures are exported
  export.figures = T,
  # Path for file exports
  paper.fig.path = "../danawriteup/figs/",
  # Clears environment
  debug=TRUE
)
if(settings$debug) rm(list=ls()[ls()!="settings"])

Parameter configuration

The config list contains all parameters for the analysis. Remember to always set the working directory and the paths to the data/external files prior to running the code.

config <- list(
  DANA.path = "./R/DANA.R",
  
  ## Data
  case.name = "TCGA",
  # Read count data file for the full data set
  data.file.path = "data/TCGA_harmonized_BRCA_UCS.RData",
  # RData file for coordinates data frame based on miRBase miRNA definitions
  coords.file.path = "data/miRBase.coords.RData",
  
  ## Normalization Methods
  # Specify which normalization methods will be applied
  norm.apply.TC = TRUE,
  norm.apply.UQ = TRUE,
  norm.apply.med = TRUE,
  norm.apply.TMM = TRUE,
  norm.apply.DESeq = TRUE,
  norm.apply.PoissonSeq = TRUE,
  norm.apply.QN = TRUE,
  norm.apply.RUV = TRUE,
  
  # thresholds for zero-expressed, poorly-expressed and well-expressed genes 
  t.zero = 2,  # zero-expressed in [0, t.zero)
  t.poor = 5,  # poorly-expressed in [t.zero, t.well]
  t.well = 100,  # well-expressed in [t.well, inf)
  # distance threshold for clustering
  cluster.threshold = 10000,
  # preprocessing data transformation type: none ("n"), modified log2 ("log2"),
  # or cube root ("cube") available
  preprocess.transform = "log2",
  
  ## Correlation Estimation for positive and negative controls
  # Available methods | Tuning parameter calibration schemes
  # "mb"              | "cv", "aic", "bic", "av"
  # "huge.mb"         | "ric", "stars"
  # "glasso"          | "ric", "stars", "ebic"
  # "fastggm"         | none
  # "pearson"         | none
  # "spearman"        | none
  
  # Positive Controls
  corr.method.pos = "mb",
  tuning.criterion.pos = "bic",
  # Negative Controls
  corr.method.neg = "pearson",
  tuning.criterion.neg = "",
  # Generate plots within DANA
  generate.plots = FALSE
)

Configuration for the differential expression analysis

config.DEA <- list(
  ## Data
  case.name = "TCGA",

  ## Normalization Methods
  # Specify which normalization methods will be applied
  norm.apply.TC = TRUE,
  norm.apply.UQ = TRUE,
  norm.apply.med = TRUE,
  norm.apply.TMM = TRUE,
  norm.apply.DESeq = TRUE,
  norm.apply.PoissonSeq = TRUE,
  norm.apply.QN = TRUE,
  norm.apply.RUV = FALSE,
  
  # Significance level for DEA
  alpha = 0.01,
  # Plots
  generate.volcano.plot = TRUE,
  generate.meanvar.plot = TRUE,
  # Use q-values (adjusted p-values) instead of p-values
  q.values = FALSE,
  # RUV reduces the parameter size. Reduce DEA result to RUV genes?
  perform.subsetting = FALSE
)

Load Packages

Load all required R libraries/packages.

# DANA functions
source(config$DANA.path)
# Libraries
library(openxlsx)  # read excel files
library(corrplot) # For mixed correlation plots
library(cowplot) # Arrange plots
library(RColorBrewer) # Colors
library(latex2exp) # for latex in axis labels 
library(ggcorrplot) # ggplot2 correlation plots

Load Data

Load the dataset under study

We load the data set into memory.

# Load the p x n matrix of read counts into the workspace
load(config$data.file.path)

# Unlist the data
data.BRCA <- TCGA.BRCA.UCS$BRCA
data.UCS <- TCGA.BRCA.UCS$UCS

# Transform gene names to lower case
rownames(data.BRCA)  <- tolower(rownames(data.BRCA))
rownames(data.UCS) <- tolower(rownames(data.UCS))

# Build single RC matrix
data.RC <- data.TCGA <- cbind(data.BRCA, data.UCS) 
data.groups <- c(rep("BRCA", dim(data.BRCA)[2]), rep("UCS", dim(data.UCS)[2]))

# Inspect
cat("Dimensions of the data: ", dim(data.RC), "\n")
## Dimensions of the data:  1881 223

miRNA Coordinates

Load corresponding miRNA chromosome coordinates

A coordinates data frame that specifies the base pair location of each miRNA of the data set on the chromosomes is loaded.

# Load miRNA chromosome location coordinates data frame
load(config$coords.file.path)
coords <- coords  # change according to the name of the loaded data matrix

Remove genes that cannot be found in the coordinates data frame

Some miRNAs map to multiple locations of sequence families. We named the sequence family with a parenthesis that reflects the number of members from all the different genomic locations (e.g. hsa-let-7a(3)). These miRNAs cannot be uniquely mapped to the genome, therefore we must exclude these from the location based analysis.

# only consider genes that are present in the coordinate data frame
genes.in.coords <- rownames(coords)[na.omit(match(rownames(data.RC), rownames(coords)))]
cat(dim(data.RC)[1]-length(genes.in.coords), " genes not found in coords. Reducing data set to ", length(genes.in.coords), "genes.\n")
## 33  genes not found in coords. Reducing data set to  1848 genes.
# test data set
data.RC <- data.RC[genes.in.coords, ]
# benchmark data set
genes <- rownames(data.RC)
rm(genes.in.coords)

Examine the Data

First, we investigate the distribution of mean miRNA expression of the data.

# Histogram plot test data set
par(mfrow=c(1,1))
df <- data.frame(log.expression=log2(rowMeans(data.RC)+1))
print(ggplot(df, aes(x=log.expression)) + 
        geom_histogram(binwidth = .1, color="black", fill="black") +
        geom_vline(xintercept = log2(config$t.zero+1), color="blue", linetype="dashed") +
        geom_vline(xintercept = log2(config$t.poor+1), color="blue", linetype="dashed") +
        geom_vline(xintercept = log2(config$t.well+1), color="red",  linetype="dashed") +
        ggtitle("Test data set"))

rm(df)

We observe that there is a large proportion of poorly-expressed genes. Some of them have extremely low or zero mean expression which corresponds to essentially zero reads.

DANA - Performance Analysis of Normalization Methods

We apply the full analysis pipeline to the data set. This includes:

  1. Apply the normalization methods to the raw counts.
  2. Define positive and negative controls based on read counts.
  3. Estimate the level of coexpression between miRNAs for each data set.
  4. Compare positive and negative controls before and after normalization.

The following parameters are used:

  • Case name: TCGA
  • Control definition:
    • Definition of negative controls, read count in [2, 5]
    • Definition of positive controls, read count in [100, inf]
    • Distance threshold for clustering: 10^{4}
  • Data preprocessing transformation: log2
  • Coexpression estimation:
    • Correlation estimation method for positive controls: mb
    • Correlation estimation tuning parameter calibration method for positive controls: bic
  • Comparison statistics:
    • Generate plots? FALSE
genes <- rownames(data.RC)

## Apply Normalization
data.norm <- normalize(data.RC,
                       groups=data.groups,
                       name=config$case.name,
                       apply.TC=config$norm.apply.TC,
                       apply.UQ=config$norm.apply.UQ,
                       apply.med=config$norm.apply.med,
                       apply.TMM=config$norm.apply.TMM,
                       apply.DESeq=config$norm.apply.DESeq,
                       apply.PoissonSeq=config$norm.apply.PoissonSeq,
                       apply.QN=config$norm.apply.QN,
                       apply.RUV=config$norm.apply.RUV)
## 1096 genes has been filtered because they contains too small number of reads across the experiments.
## Define Controls
pre.res <- define.controls(data.RC,
                           t.zero=config$t.zero,
                           t.poor=config$t.poor,
                           t.well=config$t.well,
                           t.cluster=config$cluster.threshold,
                           coords=coords,
                           clusters=NULL)
## Number of positive control markers with RC in [100, inf): 116
## Number of negative control markers with RC in [2, 5]: 91
pos.controls <- pre.res$genes.pos
neg.controls <- pre.res$genes.neg
clusters <- pre.res$clusters
clusters.data <- clusters


# Apply DANA pipeline
DANA.results <- DANA(data.RC=data.RC, 
                     data.norm=data.norm, 
                     pos.controls=pos.controls, 
                     neg.controls=neg.controls, 
                     clusters=clusters.data,
                     coords=coords,
                     case.name="TCGA",
                     generate.plots=config$generate.plots,
                     preprocess.transform=config$preprocess.transform,
                     corr.method.pos=config$corr.method.pos,
                     tuning.criterion.pos=config$tuning.criterion.pos,
                     corr.method.neg=config$corr.method.neg,
                     tuning.criterion.neg=config$tuning.criterion.neg) 
## Estimating correlations for pos. and neg. controls for data set TCGA...done
## Estimating correlations for pos. and neg. controls for data set TCGA.TMM...done
## Estimating correlations for pos. and neg. controls for data set TCGA.DESeq...done
## Estimating correlations for pos. and neg. controls for data set TCGA.PoissonSeq...done
## Estimating correlations for pos. and neg. controls for data set TCGA.TC...done
## Estimating correlations for pos. and neg. controls for data set TCGA.Med...done
## Estimating correlations for pos. and neg. controls for data set TCGA.RUVg...done
## Estimating correlations for pos. and neg. controls for data set TCGA.RUVs...done
## Estimating correlations for pos. and neg. controls for data set TCGA.RUVr...done
## Estimating correlations for pos. and neg. controls for data set TCGA.UQ...done
## Estimating correlations for pos. and neg. controls for data set TCGA.QN...done
## Comparing models TCGA vs. TCGA.TMM....done
## Comparing models TCGA vs. TCGA.DESeq....done
## Comparing models TCGA vs. TCGA.PoissonSeq....done
## Comparing models TCGA vs. TCGA.TC....done
## Comparing models TCGA vs. TCGA.Med....done
## Comparing models TCGA vs. TCGA.RUVg....done
## Comparing models TCGA vs. TCGA.RUVs....done
## Comparing models TCGA vs. TCGA.RUVr....done
## Comparing models TCGA vs. TCGA.UQ....done
## Comparing models TCGA vs. TCGA.QN....done
if(!config$generate.plots) {
  stargazer(DANA.results$metrics, type="text", summary=FALSE, digits=4, 
            title="Comparison statistics for the TCGA-BRCA/UCS data set", align=TRUE)
}
## 
## Comparison statistics for the TCGA-BRCA/UCS data set
## =============================
##                   cc    mscr 
## -----------------------------
##    TCGA.TMM     0.9904 0.7936
##   TCGA.DESeq    0.9896 0.8031
## TCGA.PoissonSeq 0.9882 0.7958
##     TCGA.TC     0.9927 0.6539
##    TCGA.Med     0.9245 0.5716
##    TCGA.RUVg    0.9854 0.8171
##    TCGA.RUVs    0.8288 0.8551
##    TCGA.RUVr    0.9763 0.8359
##     TCGA.UQ     0.9374 0.6053
##     TCGA.QN     0.9717 0.8308
## -----------------------------
iplot.data.noNorm <- iggplot.corr(DANA.results$data.model$pos$corr, clusters=clusters, title="Positive controls (un-normalized)", coords=coords)
iplot.data.TMM <- iggplot.corr(DANA.results$norm.models$TCGA.TMM$pos$corr, clusters=clusters, title="Positive controls (TMM)", coords=coords)

Investigate Co-expression Structures

We compare the estimated co-expression structures for the test and benchmark dataset. Here, we take a look at the models for the un-normalized data as well as TMM-normalized data. Note that you cannot directly compare the graphs of benchmark with test data since the considered set of genes is not identical.

if(settings$investigate.coexpression) {
  htmltools::tagList(list(iplot.data.noNorm, iplot.data.TMM))  # show plots in markdown
}

Differential Expression Analysis

Normalize Data

if(settings$perform.DEA) {
  ## Reset data
  data.RC <- data.TCGA 
  
  ## Normalize test Data
  data.norm <- normalize(data.RC,
                         groups=data.groups,
                         name="TCGA",
                         apply.TC=config.DEA$norm.apply.TC,
                         apply.UQ=config.DEA$norm.apply.UQ,
                         apply.med=config.DEA$norm.apply.med,
                         apply.TMM=config.DEA$norm.apply.TMM,
                         apply.DESeq=config.DEA$norm.apply.DESeq,
                         apply.PoissonSeq=config.DEA$norm.apply.PoissonSeq,
                         apply.QN=config.DEA$norm.apply.QN,
                         apply.RUV=FALSE)
  # RUV normalization
  if(config.DEA$norm.apply.RUV) {
    data.norm.RUV.adjust <- list(test.RUVr=norm.RUV(data.RC, data.groups, "RUVr")$adjust.factor,
                                 test.RUVs=norm.RUV(data.RC, data.groups, "RUVs")$adjust.factor,
                                 test.RUVg=norm.RUV(data.RC, data.groups, "RUVg")$adjust.factor)
  }

}
## 1124 genes has been filtered because they contains too small number of reads across the experiments.

Test DEA

if(settings$perform.DEA) {
  ## Perform DEA on full dataset
  test.DEA <- DE.voom(data.RC=data.RC, groups=data.groups, plot=config.DEA$generate.meanvar.plot, plot.title="Un-normalized")
  if(config.DEA$generate.volcano.plot) plot.DE.volcano(test.DEA, alpha=config.DEA$alpha, q.values=config.DEA$q.values, title="Un-normalized")
  
  ## DEA for normalized test data
  data.norm.DEA <- list()
  for(i in 1:length(data.norm)) {
    data.norm.DEA <- 
      append(data.norm.DEA, list(DE.voom(data.RC=data.norm[[i]], 
                                         groups=data.groups, 
                                         plot=config.DEA$generate.meanvar.plot,
                                         plot.title=names(data.norm)[i])))
    if(config.DEA$generate.volcano.plot) {
      plot.DE.volcano(data.norm.DEA[[i]], 
                      alpha=config.DEA$alpha, 
                      q.values=config.DEA$q.values, 
                      title=names(data.norm)[i])
    }
  }
  ## DEA for RUV normalization
  if(config.DEA$norm.apply.RUV) {
    for(i in 1:length(data.norm.RUV.adjust)) {
      data.norm.DEA <- 
        append(data.norm.DEA, list(DE.voom(data.RC=data.RC, 
                                           groups=data.groups, 
                                           plot=config.DEA$generate.meanvar.plot,
                                           plot.title=names(data.norm.RUV.adjust)[i],
                                           adjust=data.norm.RUV.adjust[[i]])))
      if(config.DEA$generate.volcano.plot) {
        plot.DE.volcano(data.norm.DEA[[i]], 
                        alpha=config.DEA$alpha, 
                        q.values=config.DEA$q.values, 
                        title=names(data.norm.RUV.adjust)[i])
      }
    }
    names(data.norm.DEA) <- c(names(data.norm), names(data.norm.RUV.adjust))
  } else {
    names(data.norm.DEA) <- names(data.norm)
  }
}

## number of DE genes: 1515

## number of DE genes: 1552

## number of DE genes: 1586

## number of DE genes: 1243

## number of DE genes: 597

## number of DE genes: 1198

## number of DE genes: 1134

## number of DE genes: 554

Results and Figures for Paper

Number of samples, markers, and positive and negative controls

if(settings$generate.paper.figs) {
  
# Dimension of data after analysis
cat("Data:\n")
cat("  - Dimensions: p=", dim(data.RC)[1],", n=", dim(data.RC)[2], "\n", sep="")
cat("  - Positive controls:\n")
cat("    * Definition mean RC in interval: [", config$t.well, ", inf ]\n", sep="")
cat("    * Number of positive controls miRNAs:", length(pos.controls), "\n")
cat("  - Negative controls:\n")
cat("    * Definition mean RC in interval: [", config$t.zero, ",", config$t.poor, "]\n")
cat("    * Number of negative controls miRNAs:", length(neg.controls), "\n")

}
## Data:
##   - Dimensions: p=1881, n=223
##   - Positive controls:
##     * Definition mean RC in interval: [100, inf ]
##     * Number of positive controls miRNAs: 116 
##   - Negative controls:
##     * Definition mean RC in interval: [ 2 , 5 ]
##     * Number of negative controls miRNAs: 91

Read Count Distribution in the Data

Per Data-Set

if(settings$generate.paper.figs) {
  par(mfrow=c(1,1))
  
  # Histogram plot test data set
  data.RC.log2 <- log2(rowMeans(data.RC)+1)
  df <- data.frame(log.expression=data.RC.log2)
  p.data.RC.hist <- ggplot(df, aes(x=log.expression)) + 
    geom_histogram(binwidth = .1, color="black", fill="black") +
    geom_vline(xintercept = log2(config$t.zero+1), color="blue", linetype="dashed") +
    geom_vline(xintercept = log2(config$t.poor+1), color="blue", linetype="dashed") +
    geom_vline(xintercept = log2(config$t.well+1), color="red",  linetype="dashed") +
    ggtitle("TCGA-BRCA/UCS data set") +
    theme_classic()
  print(p.data.RC.hist)

  
  p.RC.hist <- ggplot(df) + 
    theme_classic() +
    geom_histogram(aes(x=log.expression, y=..count..), binwidth = .1) +
    geom_vline(xintercept = log2(config$t.zero+1), color="#5851b8", linetype="twodash") +
    geom_vline(xintercept = log2(config$t.poor+1), color="#5851b8", linetype="twodash") +
    geom_vline(xintercept = log2(config$t.well+1), color="#E7298A",  linetype="longdash") +
    geom_segment(aes(x = log2(config$t.zero+1), y = 600, xend = log2(config$t.poor+1), yend = 600),
                 arrow=arrow(length=unit(.07, "in"), ends="both"),
                 color="#5851b8") +
    annotate(geom="label", x=(log2(config$t.zero+1)+log2(config$t.poor+1))/2.0, y=630,
             label="neg. contr.", color="#5851b8", size=4, fill = alpha(c("white"),0.85), label.size = NA) +
    geom_segment(aes(x = log2(config$t.well+1), y = 600, xend = log2(config$t.well+1)+4, yend = 600),
                 arrow=arrow(length=unit(.07, "in"), ends="last"),
                 color="#E7298A") +
    annotate(geom="label", x=log2(config$t.well+1)+.5, y=630,
             label="pos. contr.", color="#E7298A", size=4, hjust=0, fill = alpha(c("white"),0.85), label.size = NA) +
    xlab("Mean log2(read count)") +
    ylab("Frequency")
  print(p.RC.hist)
  
    ## Export Plots
  if(settings$export.figures) {
    ggsave(paste0(settings$paper.fig.path, "BRCA_UCS_RC_Distribution.pdf"), p.RC.hist, width=5, height=4, device="pdf")
  }
  
}

Mean-Variance Plot

Distribution property among technical replicates

if(settings$generate.paper.figs) {
  # Function for mean-variance plot for a given data set
  mean.variance.plot <- function(data.RC, title, axis.limit=NULL) {
    df <- data.frame(data.mean=log10(rowMeans(data.RC) + 1),
                     data.var =log10(rowVars(data.RC) + 1))
    lin.fit <- lm(df$data.var ~ df$data.mean)$coefficients
    p <- ggplot(df,aes(x=data.mean,y=data.var)) + 
      geom_point(alpha=.25) + 
      xlab("log10(Mean)") + 
      ylab("log10(Variance)") + 
      geom_abline(intercept = lin.fit[1], slope = lin.fit[2], color="red", linetype="longdash", size=1) +
      geom_abline(intercept = 0, slope = 1, colour="blue") +
      annotate("text", alpha=1, x=4, y=0.5, hjust=0, vjust=0, size=3, colour="red",
               label=paste0("log10(Variance) = ", round(lin.fit[1],2), " + ", round(lin.fit[2],2), "*log10(Mean)")) + 
      xlim(0, ifelse(is.null(axis.limit), max(df), axis.limit)) +
      ylim(0, ifelse(is.null(axis.limit), max(df), axis.limit)) +
      ggtitle(title) +
      theme(aspect.ratio=1, plot.title = element_text(hjust = 0.5)) +
      theme_bw()
    return(p)
  }
  
  # Both subtypes
  p.meanvar <- mean.variance.plot(data.RC, "Full data set", axis.limit=12.5)
  print(p.meanvar)
  # BRCA
  p.meanvar.BRCA <- mean.variance.plot(data.RC[, data.groups=="BRCA"], "BRCA", axis.limit=12.5)
  print(p.meanvar.BRCA)
  # UCS
  p.meanvar.UCS <- mean.variance.plot(data.RC[, data.groups=="UCS"], "UCS", axis.limit=12.5)
  print(p.meanvar.UCS)
}

Marker-specific sd vs mean

if(settings$generate.paper.figs) {
  
  # Both subtypes
  p.meansd <- plot.mean.sd(data.RC, config$t.zero, config$t.poor, config$t.well, "Full data set")
  print(p.meansd)
  # BRCA
  p.meansd.BRCA <- plot.mean.sd(data.RC[, data.groups=="BRCA"], config$t.zero, config$t.poor, config$t.well, "BRCA")
  print(p.meansd.BRCA)
  # UCS
  p.meansd.UCS <- plot.mean.sd(data.RC[, data.groups=="UCS"], config$t.zero, config$t.poor, config$t.well, "UCS")
  print(p.meansd.UCS)
  
  
  ## Export Plots
  if(settings$export.figures) {
    ggsave(paste0(settings$paper.fig.path, "BRCA_UCS_MeanSD.pdf"), p.meansd + labs(title = NULL), width = 5, height=4, device="pdf")
  }
}

Correlation Plots

if(settings$generate.paper.figs) {
  num.genes.plot.pos <- 60
  num.genes.plot.neg <- 60

  # Co-expression models
  data.model <- DANA.results$data.model
  data.norm.model <- DANA.results$norm.models$TCGA.DESeq
  
  # Common control miRNAs
  pos.controls <- rownames(data.model$pos$corr)
  pos.controls.norm <- rownames(data.norm.model$pos$corr)
  pos.controls.common <- intersect(pos.controls, pos.controls.norm)
  neg.controls <- rownames(data.model$neg$corr)
  neg.controls.norm <- rownames(data.norm.model$neg$corr)
  # reduce the number of genes for the plot
  num.genes.plot.pos <- min(num.genes.plot.pos, length(pos.controls.common))
  pos.controls.plot <- pos.controls.common[1:num.genes.plot.pos]
  
  num.genes.plot.neg <- min(num.genes.plot.neg,
                            dim(data.model$neg$corr)[1],
                            dim(data.norm.model$neg$corr)[1])
  
  # Subsetted correlation matrices
  corr.data.pos <- data.model$pos$corr[pos.controls.plot, pos.controls.plot]
  corr.data.DESeq.pos <- data.norm.model$pos$corr[pos.controls.plot, pos.controls.plot]
  corr.data.neg <- data.model$neg$corr[1:num.genes.plot.neg, 1:num.genes.plot.neg]
  corr.data.DESeq.neg <- data.norm.model$neg$corr[1:num.genes.plot.neg, 1:num.genes.plot.neg]
  
  
  ## Generate plots
  # Positive controls
  p.corr.pos <- ggplot.corr(corr.data.pos, 
                                 clusters=clusters, 
                                 threshold=0, 
                                 title="Un-normalized",
                                 coords=coords)
  p.corr.pos.DESeq <- ggplot.corr(corr.data.DESeq.pos, 
                                     clusters=clusters, 
                                     threshold=0, 
                                     title="DESeq",
                                     coords=coords)
  
  
  p.corr.pos.two <- arrangeGrob(p.corr.pos.DESeq + theme(legend.position="none"),
                                p.corr.pos + theme(legend.position="none"),
                                get.legend(p.corr.pos.DESeq + theme(legend.position = "bottom")),
                                layout_matrix=rbind(c(1,2), c(4,4)),
                                heights = c(5,1))
  grid.arrange(p.corr.pos.two)  # display plot
  
  # Negative controls
  p.corr.neg <- ggplot.corr(corr.data.neg, 
                            clusters=clusters, 
                            threshold=0, 
                            title="Un-normalized")
  p.corr.neg.DESeq <- ggplot.corr(corr.data.DESeq.neg, 
                                  clusters=clusters, 
                                  threshold=0, 
                                  title="DESeq")
  
  p.corr.neg.two <- arrangeGrob(p.corr.neg.DESeq + theme(legend.position="none"),
                                p.corr.neg + theme(legend.position="none"),
                                get.legend(p.corr.neg.DESeq + theme(legend.position = "bottom")),
                                layout_matrix=rbind(c(1,2), c(4,4)),
                                heights = c(5,1))
  grid.arrange(p.corr.neg.two)  # display plot
  
  
  ## Export Plots
  if(settings$export.figures) {
    ggsave(paste0(settings$paper.fig.path, "BRCA_UCS_CorrPos_DESeq.pdf"), p.corr.pos.two, width = 5, height=3.5, device="pdf")
    ggsave(paste0(settings$paper.fig.path, "BRCA_UCS_CorrNeg_DESeq.pdf"), p.corr.neg.two, width = 5, height=3.5, device="pdf")
  }
}
## Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please
## use `guide = "none"` instead.

## Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please
## use `guide = "none"` instead.

All normalization methods

if(settings$generate.paper.figs) { 
  p.corr.noNorm <- ggplot.corr(DANA.results$data.model$pos$corr,
                             clusters=clusters,
                             title="un-normalized")
  p.corr.TMM <- ggplot.corr(DANA.results$norm.models$TCGA.TMM$pos$corr,
                            clusters=clusters,
                            title="TMM")
  p.corr.DESeq <- ggplot.corr(DANA.results$norm.models$TCGA.DESeq$pos$corr,
                              clusters=clusters,
                              title="DESeq")
  p.corr.PoissonSeq <- ggplot.corr(DANA.results$norm.models$TCGA.PoissonSeq$pos$corr,
                                   clusters=clusters,
                                   title="PoissonSeq")
  p.corr.TC <- ggplot.corr(DANA.results$norm.models$TCGA.TC$pos$corr,
                           clusters=clusters,
                           title="TC")
  p.corr.Med <- ggplot.corr(DANA.results$norm.models$TCGA.Med$pos$corr,
                            clusters=clusters,
                            title="Med")
  p.corr.RUVg <- ggplot.corr(DANA.results$norm.models$TCGA.RUVg$pos$corr,
                             clusters=clusters,
                             title="RUVg")
  p.corr.RUVs <- ggplot.corr(DANA.results$norm.models$TCGA.RUVs$pos$corr,
                             clusters=clusters,
                             title="RUVs")
  p.corr.RUVr <- ggplot.corr(DANA.results$norm.models$TCGA.RUVr$pos$corr,
                             clusters=clusters,
                             title="RUVr")
  p.corr.UQ <- ggplot.corr(DANA.results$norm.models$TCGA.UQ$pos$corr,
                           clusters=clusters,
                           title="UQ")
  p.corr.QN <- ggplot.corr(DANA.results$norm.models$TCGA.QN$pos$corr,
                           clusters=clusters,
                           title="QN")
  
  # Arrange plots
  p.corr.all <-
    plot_grid(p.corr.noNorm + theme(legend.position="none"),
              p.corr.TMM + theme(legend.position="none"),
              p.corr.DESeq + theme(legend.position="none"),
              p.corr.PoissonSeq + theme(legend.position="none"),
              p.corr.TC + theme(legend.position="none"),
              p.corr.Med + theme(legend.position="none"),
              p.corr.RUVg + theme(legend.position="none"),
              p.corr.RUVs + theme(legend.position="none"),
              p.corr.RUVr + theme(legend.position="none"),
              p.corr.UQ + theme(legend.position="none"),
              p.corr.QN + theme(legend.position="none"),
              get.legend(p.corr.noNorm + theme(legend.position = "bottom")),
              ncol=3)
  plot(p.corr.all)
  
  if(settings$export.figures) {
    ggsave(paste0(settings$paper.fig.path, "BRCA_UCS_Corr_All.pdf"), p.corr.all, width=9, height=12, device="pdf")
  }
}
## Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please
## use `guide = "none"` instead.

Correlation Frequency Plots

if(settings$generate.paper.figs) {
  models <- list(
    "no-normalization" = DANA.results$data.model,
    TMM = DANA.results$norm.models$TCGA.TMM,
    DESeq = DANA.results$norm.models$TCGA.DESeq,
    PoissonSeq = DANA.results$norm.models$TCGA.PoissonSeq,
    RUVg = DANA.results$norm.models$TCGA.RUVg,
    RUVs = DANA.results$norm.models$TCGA.RUVs,
    RUVr = DANA.results$norm.models$TCGA.RUVr,
    TC = DANA.results$norm.models$TCGA.TC,
    Med = DANA.results$norm.models$TCGA.Med,
    UQ = DANA.results$norm.models$TCGA.UQ,
    QN = DANA.results$norm.models$TCGA.QN
  )
  
  # Set number of genes for negative correlation to minimum found
  num.genes.plot <- min(sapply(models, function(x) dim(x$neg$corr)[1]))
  
  # Subsetted correlation matrices
  corrs <- lapply(models, function(x) x$neg$corr[1:num.genes.plot, 1:num.genes.plot])
  corrs <- lapply(corrs, function(x) x[upper.tri(x)])
  
  control <- factor(
    as.vector(sapply(names(corrs), function(x) rep(x,length(corrs[[1]])))),
    levels = names(models) 
  )
  neg.corr <- data.frame(
    control=factor(control),
    corr=unname(unlist(corrs))
  )
  
  # plot insert
  p.insert <- ggplot(neg.corr, aes(x=corr, colour=control, linetype=control)) +
    geom_vline(xintercept = 0, size=.4, color="gray80", linetype="solid") +
    geom_freqpoly(binwidth=.05, alpha=.9, size=.3) +
    scale_color_manual(values=brewer.pal(n=12, name="Paired")) +
    theme_bw() +
    xlim(-1, 1) +
    theme(legend.position="none",
          axis.title.x = element_blank(),
          axis.title.y = element_blank())
  
  
  p.corr.frequency.neg.controls.all <- ggplot(neg.corr, aes(x=corr, colour=control, linetype=control)) +
    geom_vline(xintercept = 0, size=.4, color="gray80", linetype="solid") +
    geom_freqpoly(binwidth=.05, alpha=.9, size=.75) +
    theme_classic() +
    xlim(-1, 1) +
    ylim(0,600) +
    scale_color_manual(values=brewer.pal(n=12, name="Paired")) +
    theme(legend.position=c(0.85,0.8), 
          legend.title=element_blank(),
          legend.background=element_rect(fill=alpha('white', 0.5))) +
    ylab("Frequency") +
    xlab("Marginal correlation") + 
    theme(legend.key.width = unit(2.4,"cm")) +
    annotation_custom(ggplotGrob(p.insert), xmin=-1.1, xmax=-0.3, ymin=300, ymax=600)
  print(p.corr.frequency.neg.controls.all)
  
  
  if(settings$export.figures) {
    ggsave(paste0(settings$paper.fig.path, "BRCA_UCS_CorrDensityNeg_Freq_All.pdf"), p.corr.frequency.neg.controls.all, width = 8, height=8, device="pdf")
  }
}
## Warning: Removed 22 row(s) containing missing values (geom_path).

## Warning: Removed 22 row(s) containing missing values (geom_path).

## Warning: Removed 22 row(s) containing missing values (geom_path).

Correlation Scatter Plots

if(settings$generate.paper.figs) {
  par(mfrow=c(1,1))
  
  # un-normalized vs TMM
  p.scatter.TMM <- plot.corr.scatter(
    corr1=DANA.results$data.model$pos$corr, 
    corr2=DANA.results$norm.models$TCGA.TMM$pos$corr,
    clusters=clusters, title="TMM", xlab="un-normalized", ylab="TMM", 
    ccc=round(DANA.results$metrics["TCGA.TMM", "cc"],3))
  # un-normalized vs DESeq
  p.scatter.DESeq <- plot.corr.scatter(
    corr1=DANA.results$data.model$pos$corr, 
    corr2=DANA.results$norm.models$TCGA.DESeq$pos$corr,
    clusters=clusters, title="DESeq", xlab="un-normalized", ylab="DESeq", 
    ccc=round(DANA.results$metrics["TCGA.DESeq", "cc"],3))
  # un-normalized vs PoissonSeq
  p.scatter.PoissonSeq <- plot.corr.scatter(
    corr1=DANA.results$data.model$pos$corr, 
    corr2=DANA.results$norm.models$TCGA.PoissonSeq$pos$corr,
    clusters=clusters, title="PoissonSeq", xlab="un-normalized", ylab="PoissonSeq", 
    ccc=round(DANA.results$metrics["TCGA.PoissonSeq", "cc"],3))
  # un-normalized vs TC
  p.scatter.TC <- plot.corr.scatter(
    corr1=DANA.results$data.model$pos$corr, 
    corr2=DANA.results$norm.models$TCGA.TC$pos$corr,
    clusters=clusters, title="TC", xlab="un-normalized", ylab="TC", 
    ccc=round(DANA.results$metrics["TCGA.TC", "cc"],3))
  # un-normalized vs Med
  p.scatter.Med <- plot.corr.scatter(
    corr1=DANA.results$data.model$pos$corr, 
    corr2=DANA.results$norm.models$TCGA.Med$pos$corr,
    clusters=clusters, title="Med", xlab="un-normalized", ylab="Med", 
    ccc=round(DANA.results$metrics["TCGA.Med", "cc"],3))
  # un-normalized vs RUVg
  p.scatter.RUVg <- plot.corr.scatter(
    corr1=DANA.results$data.model$pos$corr, 
    corr2=DANA.results$norm.models$TCGA.RUVg$pos$corr,
    clusters=clusters, title="RUVg", xlab="un-normalized", ylab="RUVg", 
    ccc=round(DANA.results$metrics["TCGA.RUVg", "cc"],3))
  # un-normalized vs RUVs
  p.scatter.RUVs <- plot.corr.scatter(
    corr1=DANA.results$data.model$pos$corr, 
    corr2=DANA.results$norm.models$TCGA.RUVs$pos$corr,
    clusters=clusters, title="RUVs", xlab="un-normalized", ylab="RUVs", 
    ccc=round(DANA.results$metrics["TCGA.RUVs", "cc"],3))
  # un-normalized vs RUVr
  p.scatter.RUVr <- plot.corr.scatter(
    corr1=DANA.results$data.model$pos$corr, 
    corr2=DANA.results$norm.models$TCGA.RUVr$pos$corr,
    clusters=clusters, title="RUVr", xlab="un-normalized", ylab="RUVr", 
    ccc=round(DANA.results$metrics["TCGA.RUVr", "cc"],3))
  # un-normalized vs UQ
  p.scatter.UQ <- plot.corr.scatter(
    corr1=DANA.results$data.model$pos$corr, 
    corr2=DANA.results$norm.models$TCGA.UQ$pos$corr,
    clusters=clusters, title="UQ", xlab="un-normalized", ylab="UQ", 
    ccc=round(DANA.results$metrics["TCGA.UQ", "cc"],3))
  # un-normalized vs QN
  p.scatter.QN <- plot.corr.scatter(
    corr1=DANA.results$data.model$pos$corr, 
    corr2=DANA.results$norm.models$TCGA.QN$pos$corr,
    clusters=clusters, title="QN", xlab="un-normalized", ylab="QN", 
    ccc=round(DANA.results$metrics["TCGA.QN", "cc"],3))
  
  p.scatter.TCGA.all <- plot_grid(p.scatter.TMM,
                                  p.scatter.DESeq,
                                  p.scatter.PoissonSeq,
                                  p.scatter.TC,
                                  p.scatter.Med,
                                  p.scatter.RUVg,
                                  p.scatter.RUVs,
                                  p.scatter.RUVr,
                                  p.scatter.UQ,
                                  p.scatter.QN,
                                  ncol = 3,
                                  align="hv")
  plot(p.scatter.TCGA.all)
  
  
  # Save plots
  if(settings$export.figures) {
    ggsave(paste0(settings$paper.fig.path, "BRCA_UCS_Scatter_All.pdf"), p.scatter.TCGA.all, width=9, height=12, device="pdf")
  }
}
## Warning in plot.corr.scatter(corr1 = DANA.results$data.model$pos$corr, corr2 = DANA.results$norm.models$TCGA.RUVs$pos$corr, : Warning: Shapes of correlation matrices in plot.corr.scatter do not match!
##             Reducing correlation matrices to all mutual genes.

DANA Result Metrics

if(settings$generate.paper.figs) { 
  options(scipen=4) # force non-scientific notation of x axis
  
  test.DANA.metrics <- data.frame(
    method=substr(rownames(DANA.results$metrics), 6, 20),
    cc=DANA.results$metrics[, "cc"],
    mscr=DANA.results$metrics[, "mscr"]
  )
  
  p.DANA.metrics <- ggplot(test.DANA.metrics,aes(x=mscr,y=cc, label=method)) + 
    geom_point(alpha=.5) +
    theme_classic() + 
    xlab(TeX("$mscr^-$; Relative reduction of handling effects")) + 
    ylab(TeX("$cc^+$; Biological signal preservation")) + 
    geom_text_repel(aes(label = method), size=3, max.overlaps = Inf, min.segment.length = 0.3) +
    # xlim(c(min(c(0, test.mposc.reduction)),max(test.mposc.reduction))) +
    scale_x_continuous(labels = scales::percent, limits=c(0,1.05), 
                       breaks = scales::pretty_breaks(n = 5)) +
    # ylim(c(0,1)) 
    scale_y_continuous(labels = scales::percent, limits = c(0.5,1))
  print(p.DANA.metrics)
  
  print("DANA Statistics for the TCGA-BRCA/UCS")
  test.DANA.metrics$cc <- round(test.DANA.metrics$cc,3)
  test.DANA.metrics$mscr <- round(test.DANA.metrics$mscr,3)
  print(test.DANA.metrics)
  
  
  
  if(settings$export.figures) {
    ggsave(paste0(settings$paper.fig.path, "BRCA_UCS_DANAMetrics.pdf"), p.DANA.metrics, width=4.5, height=3.5, device="pdf")
  }
  
}
## [1] "DANA Statistics for the TCGA-BRCA/UCS"
##        method    cc  mscr
## 1         TMM 0.990 0.794
## 2       DESeq 0.990 0.803
## 3  PoissonSeq 0.988 0.796
## 4          TC 0.993 0.654
## 5         Med 0.924 0.572
## 6        RUVg 0.985 0.817
## 7        RUVs 0.829 0.855
## 8        RUVr 0.976 0.836
## 9          UQ 0.937 0.605
## 10         QN 0.972 0.831

Package Information

sessionInfo()
## R version 4.1.2 (2021-11-01)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 10.16
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] de_DE.UTF-8/de_DE.UTF-8/de_DE.UTF-8/C/de_DE.UTF-8/de_DE.UTF-8
## 
## attached base packages:
## [1] splines   stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] ggcorrplot_0.1.3            latex2exp_0.5.0            
##  [3] RColorBrewer_1.1-2          cowplot_1.1.1              
##  [5] openxlsx_4.2.4              ffpe_1.38.0                
##  [7] TTR_0.24.2                  DescTools_0.99.44          
##  [9] vsn_3.62.0                  RUVSeq_1.28.0              
## [11] EDASeq_2.28.0               ShortRead_1.52.0           
## [13] GenomicAlignments_1.30.0    Rsamtools_2.10.0           
## [15] Biostrings_2.62.0           XVector_0.34.0             
## [17] sva_3.42.0                  BiocParallel_1.28.2        
## [19] genefilter_1.76.0           mgcv_1.8-38                
## [21] nlme_3.1-153                PoissonSeq_1.1.2           
## [23] combinat_0.0-8              DESeq2_1.34.0              
## [25] SummarizedExperiment_1.24.0 Biobase_2.54.0             
## [27] MatrixGenerics_1.6.0        matrixStats_0.61.0         
## [29] GenomicRanges_1.46.1        GenomeInfoDb_1.30.0        
## [31] IRanges_2.28.0              S4Vectors_0.32.3           
## [33] BiocGenerics_0.40.0         edgeR_3.36.0               
## [35] limma_3.50.0                FastGGM_1.0                
## [37] RcppParallel_5.1.4          Rcpp_1.0.7                 
## [39] huge_1.3.5                  glmnet_4.1-3               
## [41] Matrix_1.3-4                ggrepel_0.9.1              
## [43] plotly_4.10.0               stargazer_5.2.2            
## [45] corrplot_0.92               ggnewscale_0.4.5           
## [47] gridExtra_2.3               ggplot2_3.3.5              
## 
## loaded via a namespace (and not attached):
##   [1] utf8_1.2.2                R.utils_2.11.0           
##   [3] tidyselect_1.1.1          RSQLite_2.2.9            
##   [5] AnnotationDbi_1.56.2      htmlwidgets_1.5.4        
##   [7] grid_4.1.2                munsell_0.5.0            
##   [9] codetools_0.2-18          preprocessCore_1.56.0    
##  [11] nleqslv_3.3.2             withr_2.4.3              
##  [13] colorspace_2.0-2          filelock_1.0.2           
##  [15] highr_0.9                 knitr_1.36               
##  [17] rstudioapi_0.13           labeling_0.4.2           
##  [19] GenomeInfoDbData_1.2.7    hwriter_1.3.2            
##  [21] farver_2.1.0              bit64_4.0.5              
##  [23] rhdf5_2.38.0              vctrs_0.3.8              
##  [25] generics_0.1.1            xfun_0.28                
##  [27] BiocFileCache_2.2.0       R6_2.5.1                 
##  [29] illuminaio_0.36.0         locfit_1.5-9.4           
##  [31] reshape_0.8.8             bitops_1.0-7             
##  [33] rhdf5filters_1.6.0        cachem_1.0.6             
##  [35] DelayedArray_0.20.0       assertthat_0.2.1         
##  [37] BiocIO_1.4.0              scales_1.1.1             
##  [39] rootSolve_1.8.2.3         gtable_0.3.0             
##  [41] affy_1.72.0               methylumi_2.40.1         
##  [43] lmom_2.8                  rlang_0.4.12             
##  [45] rtracklayer_1.54.0        lazyeval_0.2.2           
##  [47] GEOquery_2.62.1           BiocManager_1.30.16      
##  [49] yaml_2.2.1                crosstalk_1.2.0          
##  [51] GenomicFeatures_1.46.1    tools_4.1.2              
##  [53] nor1mix_1.3-0             affyio_1.64.0            
##  [55] ellipsis_0.3.2            lumi_2.46.0              
##  [57] jquerylib_0.1.4           siggenes_1.68.0          
##  [59] proxy_0.4-26              plyr_1.8.6               
##  [61] sparseMatrixStats_1.6.0   progress_1.2.2           
##  [63] zlibbioc_1.40.0           purrr_0.3.4              
##  [65] RCurl_1.98-1.5            prettyunits_1.1.1        
##  [67] openssl_1.4.5             bumphunter_1.36.0        
##  [69] zoo_1.8-9                 sfsmisc_1.1-12           
##  [71] magrittr_2.0.1            data.table_1.14.2        
##  [73] mvtnorm_1.1-3             aroma.light_3.24.0       
##  [75] hms_1.1.1                 evaluate_0.14            
##  [77] xtable_1.8-4              XML_3.99-0.8             
##  [79] jpeg_0.1-9                mclust_5.4.8             
##  [81] shape_1.4.6               compiler_4.1.2           
##  [83] biomaRt_2.50.1            minfi_1.40.0             
##  [85] tibble_3.1.6              KernSmooth_2.23-20       
##  [87] crayon_1.4.2              R.oo_1.24.0              
##  [89] htmltools_0.5.2           tzdb_0.2.0               
##  [91] tidyr_1.1.4               geneplotter_1.72.0       
##  [93] expm_0.999-6              Exact_3.1                
##  [95] DBI_1.1.1                 dbplyr_2.1.1             
##  [97] MASS_7.3-54               rappdirs_0.3.3           
##  [99] boot_1.3-28               readr_2.1.1              
## [101] quadprog_1.5-8            R.methodsS3_1.8.1        
## [103] parallel_4.1.2            igraph_1.2.9             
## [105] pkgconfig_2.0.3           xml2_1.3.3               
## [107] foreach_1.5.1             annotate_1.72.0          
## [109] rngtools_1.5.2            multtest_2.50.0          
## [111] beanplot_1.2              doRNG_1.8.2              
## [113] scrime_1.3.5              stringr_1.4.0            
## [115] digest_0.6.29             base64_2.0               
## [117] rmarkdown_2.11            gld_2.6.3                
## [119] DelayedMatrixStats_1.16.0 restfulr_0.0.13          
## [121] curl_4.3.2                rjson_0.2.20             
## [123] lifecycle_1.0.1           jsonlite_1.7.2           
## [125] Rhdf5lib_1.16.0           askpass_1.1              
## [127] viridisLite_0.4.0         fansi_0.5.0              
## [129] pillar_1.6.4              lattice_0.20-45          
## [131] KEGGREST_1.34.0           fastmap_1.1.0            
## [133] httr_1.4.2                survival_3.2-13          
## [135] glue_1.5.1                xts_0.12.1               
## [137] zip_2.2.0                 png_0.1-7                
## [139] iterators_1.0.13          bit_4.0.4                
## [141] class_7.3-19              stringi_1.7.6            
## [143] HDF5Array_1.22.1          blob_1.2.2               
## [145] latticeExtra_0.6-29       memoise_2.0.1            
## [147] dplyr_1.0.7               e1071_1.7-9